# -*-Python-*-
# About 4x bigger than bi_v1_large.gin - 2.8B parameters
include 'models/bi_v1.gin'

# Size settings for this model variant, applied after the include of
# models/bi_v1.gin above.
# NOTE(review): the names below are presumably macros referenced (as %name)
# by the included file; confirm against models/bi_v1.gin.
#
# Presumably the model (residual stream) width — TODO confirm.
d_model = 1024
# Presumably the number of transformer layers per stack — TODO confirm.
num_layers = 24
# Presumably the feed-forward hidden width; the value is 16 * d_model.
d_ff = 16384
# Presumably the number of attention heads — TODO confirm.
num_heads = 32
# Presumably the per-head key/value width. Note num_heads * d_kv = 4096,
# which is 4 * d_model rather than d_model.
d_kv = 128
# Sets the model_parallelism parameter of utils.tpu_mesh_shape to 8.
# NOTE(review): num_heads (32) and d_ff (16384) are both divisible by 8;
# presumably that matters for how the model is split — confirm.
utils.tpu_mesh_shape.model_parallelism = 8
